Correlation between functional selections on humanDAG1 and mastomysDAG1 cells
Compare functional scores (effects on cell entry) measured on 293T cells and on 293TΔDAG1 cells expressing humanDAG1 or mastomysDAG1.
In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import altair as alt
# Allow more rows for Altair: the charts below are built directly from the
# full merged table, so turn off the data transformer's max-rows check.
# The result is bound to `_`, presumably so it is not echoed as cell output.
_ = alt.data_transformers.disable_max_rows()
File paths for data:
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
# Every value defaults to None here and must be supplied before the cells
# below are run.

# CSV files of functional effects; each is read with `pd.read_csv` and the
# three tables are merged on (site, wildtype, mutant).
HEK293T_data_path = None
humanDAG1_data_path = None
mastomysDAG1_data_path = None
# Minimum times seen filter
# Compared against the `average_times_seen` column when computing the printed
# correlations, and used as the initial value of the chart slider.
MTS = None
# Rows are kept only if the n_selections column of all three data sets
# equals this value.
n_selections = None
# Directory that is created (if missing) before the chart is saved.
html_dir = None
# File path the interactive chart is saved to.
html_output = None
In [3]:
# Parameters
# NOTE(review): this looks like the cell papermill injects at run time to
# override the None defaults of the `parameters`-tagged cell -- confirm before
# editing these values by hand.
HEK293T_data_path = "results/func_effects/averages/293T_entry_func_effects.csv"
humanDAG1_data_path = "results/func_effects/averages/human_293T_entry_func_effects.csv"
mastomysDAG1_data_path = (
"results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
)
MTS = 2
n_selections = 8
html_dir = "results/DAG1_ortholog_correlations/"
html_output = "results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [4]:
# # Uncomment the lines below to run this notebook interactively
# HEK293T_data_path = "../results/func_effects/averages/293T_entry_func_effects.csv"
# humanDAG1_data_path = "../results/func_effects/averages/human_293T_entry_func_effects.csv"
# mastomysDAG1_data_path = "../results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
# # Minimum times seen filter
# MTS = 2
# n_selections = 8
# html_dir = "../results/DAG1_ortholog_correlations/"
# html_output = "../results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [5]:
# Load the functional-effect tables for the three cell lines
hek_df, human_df, mastomys_df = (
    pd.read_csv(csv_path)
    for csv_path in (HEK293T_data_path, humanDAG1_data_path, mastomysDAG1_data_path)
)

# Columns that identify a mutation, and the per-condition column suffixes
mutation_keys = ["site", "wildtype", "mutant"]
conditions = ["human", "mastomys", "HEK293T"]

# Build the combined table in a single chain:
#   1. inner-join the three tables so only mutations present in all remain
#   2. give the un-suffixed 293T columns an explicit `_HEK293T` suffix
#   3. add the mean of the three times_seen columns
#   4. keep rows whose n_selections equals `n_selections` in every condition
merged_df = (
    human_df
    .merge(
        mastomys_df,
        how="inner",
        on=mutation_keys,
        suffixes=["_human", "_mastomys"],
        validate="one_to_one",
    )
    .merge(
        hek_df,
        how="inner",
        on=mutation_keys,
        validate="one_to_one",
    )
    .rename(
        columns={
            stat: f"{stat}_HEK293T"
            for stat in ("effect", "times_seen", "n_selections")
        }
    )
    .assign(
        average_times_seen=lambda df: df[
            [f"times_seen_{cond}" for cond in conditions]
        ].mean(axis=1)
    )
    .loc[
        lambda df: (
            df[[f"n_selections_{cond}" for cond in conditions]] == n_selections
        ).all(axis=1)
    ]
)
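Plot the correlations between functional scores as an interactive chart; the slider sets the minimum average times seen a mutation needs in order to be displayed.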
In [6]:
# Calculate statistics
# The printed correlations use the fixed MTS threshold on average_times_seen;
# the slider in the chart only changes which points are visible.
stats_df = merged_df.loc[merged_df["average_times_seen"] >= MTS]
for x_name, x_col, y_name, y_col in [
    ("human", "effect_human", "HEK", "effect_HEK293T"),
    ("mastomys", "effect_mastomys", "HEK", "effect_HEK293T"),
    ("mastomys", "effect_mastomys", "human", "effect_human"),
]:
    r, p = sp.stats.pearsonr(stats_df[x_col], stats_df[y_col])
    print(f"r correlation {x_name} vs {y_name} (min_times_seen={MTS}): {r:.2f}")
    print(f"r^2 correlation {x_name} vs {y_name} (min_times_seen={MTS}): {r**2:.2f}")

# Slider-bound parameter holding the minimum average_times_seen to display
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Axis title for each condition, keyed by the column suffix used in merged_df
_AXIS_TITLES = {
    "human": ["effect on cell entry measured in", "293T\u0394DAG1+humanDAG1 cells"],
    "mastomys": ["effect on cell entry measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
    "HEK293T": "effect on cell entry measured in 293T cells",
}


def _effect_axis(condition):
    """Return the axis definition shared by all panels, titled for `condition`."""
    return alt.Axis(
        title=_AXIS_TITLES[condition],
        values=[-5, -4, -3, -2, -1, 0, 1],
        domainWidth=1,
        domainColor="black",
        tickColor="black",
    )


def _correlation_panel(data, min_times_seen, x_condition, y_condition, size):
    """Build one scatter panel of `effect_<x_condition>` vs `effect_<y_condition>`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the effect_*, times_seen_*, n_selections_* and
        average_times_seen columns.
    min_times_seen : altair parameter
        Points whose average_times_seen is below this value are drawn with
        opacity 0.
    x_condition, y_condition : str
        Column suffixes ("human", "mastomys" or "HEK293T").
    size : int
        Width and height of the panel.

    Returns
    -------
    altair.Chart
    """
    per_condition_fields = [
        f"{stat}_{condition}"
        for condition in (x_condition, y_condition)
        for stat in ("effect", "times_seen", "n_selections")
    ]
    return (
        alt.Chart(data)
        .mark_point(filled=True, color="black")
        .encode(
            alt.X(
                f"effect_{x_condition}",
                axis=_effect_axis(x_condition),
                scale=alt.Scale(domain=[-5.5, 1.5]),
            ),
            alt.Y(
                f"effect_{y_condition}",
                axis=_effect_axis(y_condition),
                scale=alt.Scale(domain=[-5.5, 1.5]),
            ),
            opacity=alt.condition(
                alt.datum.average_times_seen < min_times_seen,
                alt.value(0),
                alt.value(0.1),
            ),
            tooltip=["site", "wildtype", "mutant", *per_condition_fields],
        )
        .properties(width=size, height=size)
    )


# Plot data
human_vs_hek = _correlation_panel(merged_df, selector, "human", "HEK293T", 300)
mastomys_vs_hek = _correlation_panel(merged_df, selector, "mastomys", "HEK293T", 300)
mastomys_vs_human = _correlation_panel(merged_df, selector, "human", "mastomys", 300)

corr_chart = (
    alt.hconcat(
        human_vs_hek,
        mastomys_vs_hek,
        mastomys_vs_human,
        spacing=5,
        title="Correlations of functional selections for DAG1 orthologs",
    )
    .add_params(selector)
    .configure_axis(
        grid=False,
        labelFontSize=16,
        titleFontSize=16,
        labelFontWeight="normal",
        titleFontWeight="normal",
    )
    .configure_title(fontSize=24)
)

# Make output dir if doesn't exist. `makedirs(..., exist_ok=True)` also creates
# missing parent directories and does not fail if the directory appears
# between a check and the creation, unlike `exists()` followed by `mkdir()`.
os.makedirs(html_dir, exist_ok=True)

print(f"Saving to {html_output}")
corr_chart.save(html_output)
corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93 r^2 correlation human vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs HEK (min_times_seen=2): 0.93 r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs human (min_times_seen=2): 0.94 r^2 correlation mastomys vs human (min_times_seen=2): 0.89 Saving to results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html
Out[6]:
Recreate the same plot as above, but with smaller panels, points, and fonts, for use as a figure in a manuscript.
In [7]:
# Calculate statistics on the mutations that pass the fixed MTS threshold
passing_df = merged_df[merged_df["average_times_seen"] >= MTS]
comparisons = (
    ("human vs HEK", "effect_human", "effect_HEK293T"),
    ("mastomys vs HEK", "effect_mastomys", "effect_HEK293T"),
    ("mastomys vs human", "effect_mastomys", "effect_human"),
)
for label, first_col, second_col in comparisons:
    r, p = sp.stats.pearsonr(passing_df[first_col], passing_df[second_col])
    print(f"r correlation {label} (min_times_seen={MTS}): {r:.2f}")
    print(f"r^2 correlation {label} (min_times_seen={MTS}): {r**2:.2f}")

# Slider-bound parameter controlling which points are visible
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)


def _manuscript_panel(data, threshold, x_cond, y_cond, side=115):
    """Return a small scatter panel of effect_<x_cond> against effect_<y_cond>.

    `data` is the merged table, `threshold` the altair parameter below which
    points (by average_times_seen) get opacity 0, `x_cond` / `y_cond` the
    column suffixes to plot, and `side` the panel width and height.
    """
    titles = {
        "human": ["effect on cell entry measured in", "293T\u0394DAG1+humanDAG1 cells"],
        "mastomys": ["effect on cell entry measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
        "HEK293T": "effect on cell entry measured in 293T cells",
    }

    def axis_for(cond):
        # Same ticks and styling on every axis; only the title differs
        return alt.Axis(
            title=titles[cond],
            values=[-5, -4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        )

    stats = ("effect", "times_seen", "n_selections")
    hover_fields = (
        ["site", "wildtype", "mutant"]
        + [f"{stat}_{x_cond}" for stat in stats]
        + [f"{stat}_{y_cond}" for stat in stats]
    )
    point_opacity = alt.condition(
        alt.datum.average_times_seen < threshold,
        alt.value(0),
        alt.value(0.1),
    )

    points = alt.Chart(data).mark_point(filled=True, color="black")
    encoded = points.encode(
        alt.X(f"effect_{x_cond}", axis=axis_for(x_cond), scale=alt.Scale(domain=[-5.5, 1.5])),
        alt.Y(f"effect_{y_cond}", axis=axis_for(y_cond), scale=alt.Scale(domain=[-5.5, 1.5])),
        opacity=point_opacity,
        tooltip=hover_fields,
    )
    return encoded.properties(width=side, height=side)


# Plot data
human_vs_hek = _manuscript_panel(merged_df, selector, "human", "HEK293T")
mastomys_vs_hek = _manuscript_panel(merged_df, selector, "mastomys", "HEK293T")
mastomys_vs_human = _manuscript_panel(merged_df, selector, "human", "mastomys")

# Combine the panels, then apply the reduced font and point sizes
corr_chart = alt.hconcat(
    human_vs_hek,
    mastomys_vs_hek,
    mastomys_vs_human,
    spacing=5,
)
corr_chart = corr_chart.add_params(selector)
corr_chart = corr_chart.configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
)
corr_chart = corr_chart.configure_point(size=10)

corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93 r^2 correlation human vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs HEK (min_times_seen=2): 0.93 r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs human (min_times_seen=2): 0.94 r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Out[7]: